Packages & Read in Data
#Loading pre-installed libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.0.5
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.0.5
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.0.5
library(repr)
## Warning: package 'repr' was built under R version 4.0.5
library(gifski)
## Warning: package 'gifski' was built under R version 4.0.5
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#turning off warnings
options(warn=-1)
#set directory
setwd("/Users/liamd/Documents/NFL Big Data Bowl/Data/")
##reading in non-tracking data
#includes play-by-play info on specific plays
df_plays <- read_csv("plays.csv",
col_types = cols())
#includes background info for players
df_players <- read_csv("players.csv",
col_types = cols())
head(df_plays)
## # A tibble: 6 x 25
## gameId playId playDescription quarter down yardsToGo possessionTeam
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <chr>
## 1 2018090600 37 J.Elliott kicks 65 y~ 1 0 0 PHI
## 2 2018090600 366 (9:20) C.Johnston pu~ 1 4 4 PHI
## 3 2018090600 658 (5:03) M.Bryant 21 y~ 1 4 3 ATL
## 4 2018090600 677 M.Bosher kicks 64 ya~ 1 0 0 ATL
## 5 2018090600 872 (:33) C.Johnston pun~ 1 4 18 PHI
## 6 2018090600 973 (14:18) M.Bosher pun~ 2 4 19 ATL
## # ... with 18 more variables: specialTeamsPlayType <chr>,
## # specialTeamsResult <chr>, kickerId <dbl>, returnerId <chr>,
## # kickBlockerId <dbl>, yardlineSide <chr>, yardlineNumber <dbl>,
## # gameClock <time>, penaltyCodes <chr>, penaltyJerseyNumbers <chr>,
## # penaltyYards <dbl>, preSnapHomeScore <dbl>, preSnapVisitorScore <dbl>,
## # passResult <chr>, kickLength <dbl>, kickReturnYardage <dbl>,
## # playResult <dbl>, absoluteYardlineNumber <dbl>
head(df_players)
## # A tibble: 6 x 7
## nflId height weight birthDate collegeName Position displayName
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 42901 6-1 208 1992-07-25 James Madison SS Dean Marlowe
## 2 43501 6-0 220 1994-08-11 Central Michigan FS Kavon Frazier
## 3 43534 6-0 205 1993-06-02 Illinois SS Clayton Fejedelem
## 4 43535 6-1 235 1992-12-22 Temple MLB Tyler Matakevich
## 5 44174 6-1 236 1993-02-22 Texas Tech ILB Sam Eguavoen
## 6 44931 6-4 221 1993-09-16 North Carolina WR Mack Hollins
##Reading tracking data (needs to be done iteratively)
#weeks of NFL season
seasons <- seq(2018, 2020)
#blank dataframe to store tracking data
df_tracking <- data.frame()
#iterating through all weeks
for(s in seasons){
#temperory dataframe used for reading season for given iteration
df_tracking_temp <- read_csv(paste0("tracking",s,".csv"),
col_types = cols())
#storing temporary dataframe in full season dataframe
df_tracking <- bind_rows(df_tracking_temp, df_tracking)
}
head(df_tracking)
## # A tibble: 6 x 18
## time x y s a dis o dir event nflId
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 2021-01-03 18:03:02 61.2 46.8 0.08 0.13 0.01 186. 145. None 42901
## 2 2021-01-03 18:03:02 61.2 46.8 0.11 0.19 0.01 185. 126. None 42901
## 3 2021-01-03 18:03:02 61.2 46.8 0.1 0.18 0.01 183. 109. None 42901
## 4 2021-01-03 18:03:02 61.2 46.8 0.11 0.22 0.01 180. 91.6 None 42901
## 5 2021-01-03 18:03:02 61.2 46.8 0.05 0.2 0.01 172. 120. None 42901
## 6 2021-01-03 18:03:02 61.2 46.8 0.18 0.8 0.01 169. 237. None 42901
## # ... with 8 more variables: displayName <chr>, jerseyNumber <dbl>,
## # position <chr>, team <chr>, frameId <dbl>, gameId <dbl>, playId <dbl>,
## # playDirection <chr>
rm(df_tracking_temp)
Cleaning Data
#Standardizing tracking data so its always in direction of punting team.
df_tracking <- df_tracking %>%
mutate(x = ifelse(playDirection == "left", 120-x, x),
y = ifelse(playDirection == "left", 160/3 - y, y))
#Subset for punts
df_plays_punts <- df_plays %>%
filter(specialTeamsPlayType == 'Punt')
#Add kicker and returner names
df_plays_punts <- merge(x = df_plays_punts, y = df_players, by.x = "kickerId", by.y = "nflId", all.x = TRUE)
df_plays_punts <- merge(x = df_plays_punts, y = df_players, by.x = "returnerId", by.y = "nflId", all.x = TRUE)
df_plays_punts <- df_plays_punts %>%
rename(kickerName = displayName.x) %>%
rename(returnerName = displayName.y)
#Change 'OAK' and 'LV' to be 'OAK/LV' since the Raiders moved
df_plays_punts <- df_plays_punts %>%
mutate(possessionTeam = ifelse(possessionTeam == 'OAK', 'OAK_LV', possessionTeam)) %>%
mutate(possessionTeam = ifelse(possessionTeam == 'LV', 'OAK_LV', possessionTeam))
#Subset for Panthers
#df_plays_panthers <- df_plays_punts %>%
# filter(possessionTeam == 'CAR')
#Subset for Patriots
#df_plays_patriots <- df_plays_punts %>%
# filter(possessionTeam == 'NE')
Summary Data on Punt Return Yardage
#---------------------------- TEAMS ----------------------------------
#Average punt return yardage by kicking team (lower value better for the team)
df_avg_punt_team <- df_plays_punts %>%
group_by(possessionTeam) %>%
summarise(avgPuntReturn = mean(kickReturnYardage, na.rm = TRUE)) %>%
rename(kickingTeam = possessionTeam) %>%
arrange(avgPuntReturn)
## Get obs count where the punt was returned
team_count <- df_plays_punts %>%
count(possessionTeam, specialTeamsResult == "Return")
team_count <- team_count %>%
filter(`specialTeamsResult == "Return"` == TRUE) %>%
select(-`specialTeamsResult == "Return"`)
## Merge datasets
df_avg_punt_team <- merge(x = df_avg_punt_team, y = team_count, by.x = "kickingTeam", by.y = "possessionTeam", all.x = TRUE)
## Plot
ggplot(data = df_avg_punt_team, aes(x = avgPuntReturn, y = n, label = kickingTeam, color = -avgPuntReturn)) +
geom_point() +
scale_x_reverse() +
geom_text(aes(label = kickingTeam), hjust = -.3) +
scale_color_gradientn(colors = rainbow(5)) +
ggtitle("Average Punt Return Against by Team") +
ylab("Number of Punts") +
xlab("Average Punt Return Against") +
theme_minimal()

#---------------------------- KICKERS ----------------------------------#
#Average punt return yardage by kicker (lower value better)
df_avg_punt_kicker <- df_plays_punts %>%
group_by(kickerName) %>%
summarise(avgPuntReturn = mean(kickReturnYardage, na.rm = TRUE)) %>%
arrange(avgPuntReturn)
df_avg_punt_kicker
## # A tibble: 54 x 2
## kickerName avgPuntReturn
## <chr> <dbl>
## 1 Jake Elliott 0
## 2 Ryan Winslow 1.5
## 3 Drew Kaser 3.8
## 4 Thomas Morstead 5.10
## 5 Sterling Hofrichter 5.4
## 6 Jack Fox 5.5
## 7 Dustin Colquitt 5.81
## 8 Logan Cooke 6.26
## 9 Kevin Huber 6.34
## 10 Rigoberto Sanchez 6.48
## # ... with 44 more rows
## Get obs count
kicker_count <- df_plays_punts %>%
count(kickerName, specialTeamsResult == "Return")
kicker_count <- kicker_count %>%
filter(`specialTeamsResult == "Return"` == TRUE) %>%
select(-`specialTeamsResult == "Return"`)
## Merge datasets
df_avg_punt_kicker <- merge(x = df_avg_punt_kicker, y = kicker_count, by.x = "kickerName", by.y = "kickerName", all.x = TRUE)
## Plot
kicker_plot <- ggplot(data = df_avg_punt_kicker, aes(x = avgPuntReturn, y = n, text = kickerName, color = -avgPuntReturn)) +
geom_point() +
scale_x_reverse() +
scale_color_gradientn(colors = rainbow(5)) +
ggtitle("Average Punt Return Against by Kicker") +
ylab("Number of Punts") +
xlab("Average Punt Return Against") +
theme_minimal()
ggplotly(kicker_plot, tooltip = c("text", "x", "y"))
#---------------------------- RETURNERS ----------------------------------#
#Average punt return yardage by returner (higher value better)
df_avg_punt_returner <- df_plays_punts %>%
group_by(returnerName) %>%
summarise(avgPuntReturn = mean(kickReturnYardage, na.rm = TRUE)) %>%
arrange(desc(avgPuntReturn))
df_avg_punt_returner
## # A tibble: 185 x 2
## returnerName avgPuntReturn
## <chr> <dbl>
## 1 Jalen Reagor 31.3
## 2 Corey Coleman 19
## 3 Keelan Cole 15.6
## 4 Nyheim Hines 15.0
## 5 Gunner Olszewski 13.5
## 6 Brandon Aiyuk 13
## 7 Cyrus Jones 12.6
## 8 Jakeem Grant 12.5
## 9 Isaiah Rodgers 12
## 10 Malcolm Perry 12
## # ... with 175 more rows
## Get obs count
returner_count <- df_plays_punts %>%
count(returnerName, specialTeamsResult == "Return")
returner_count <- returner_count %>%
filter(`specialTeamsResult == "Return"` == TRUE) %>%
select(-`specialTeamsResult == "Return"`)
## Merge datasets
df_avg_punt_returner <- merge(x = df_avg_punt_returner, y = returner_count, by.x = "returnerName", by.y = "returnerName", all.x = TRUE)
df_avg_punt_returner <- na.omit(df_avg_punt_returner)
## Plot
returner_plot <- ggplot(data = df_avg_punt_returner, aes(x = avgPuntReturn, y = n, text = returnerName, color = avgPuntReturn)) +
geom_point() +
scale_color_gradientn(colors = rainbow(5)) +
ggtitle("Average Punt Return by Returner") +
ylab("Number of Punts") +
xlab("Average Punt Return") +
theme_minimal()
ggplotly(returner_plot, tooltip = c("text", "x", "y"))
# Remove unnecessary variables
rm(list = c('kicker_count', 'team_count', 'returner_count'))